Matplotlib 官方文件: https://matplotlib.org/
Seaborn 官方文件: https://seaborn.pydata.org/
以上兩個是在進行資料視覺化時,常使用的兩個套件,Matplotlib的自由度高,Seaborn呈現方式多元成熟,兩者能夠互相搭配使用
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
這是份包含不同類別鋼鐵的資料,包含長度、亮度、面積等資訊
鋼鐵的類別包含:Pastry、Z_Scratch、K_Scatch、Stains、Dirtiness、Bumps、Other_Faults。我們希望了解各類別之間是否因為屬性上的差異而造成不同的分類結果,以及各屬性之間的相關性,因此可以先透過資料視覺化進行初步的了解
# Load the steel faults dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='faults.csv')
# Preview the first five rows (the default for head()).
df.head(5)
此處進行簡單的資料預處理,主要是將以 dummy variable(one-hot)表示的類別欄位轉換成單一的分類欄位,並且移除不需要的欄位
from xlsxwriter.utility import xl_rowcol_to_cell
# Collapse the one-hot fault indicator columns into a single 'class' label column.
choices = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# A row is assigned a fault only when that fault's indicator is 1 and every
# other indicator is 0 -- the same rule the seven hand-written conditions
# expressed, generated here so the column list lives in one place.
conditions = []
for fault in choices:
    mask = df[fault] == 1
    for other in choices:
        if other != fault:
            mask &= df[other] == 0
    conditions.append(mask)

# Pass an explicit string default: np.select's implicit default is the
# integer 0, which does not share a dtype with the string choices and can be
# rejected by recent NumPy releases. Rows matching no condition are labelled
# 'Unknown'.
df['class'] = np.select(conditions, choices, default='Unknown')

# Drop the one-hot fault columns together with the steel-type column that the
# original comment calls redundant. The original code built this list but
# then dropped only `choices`, leaving 'TypeOfSteel_A400' in the frame.
# NOTE(review): assumes faults.csv contains a 'TypeOfSteel_A400' column -- confirm.
drp_cols = ['TypeOfSteel_A400', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
df.drop(drp_cols, inplace=True, axis=1)
df
本次會介紹matplotlib當中的五種圖形的使用與語法,並且以上述鋼鐵資料集來做為範例
# Histogram of a single numeric column over the whole dataset.
luminosity_col = "Minimum_of_Luminosity"
plt.hist(x=df[luminosity_col], bins=10, color='c')  # bins is the number of intervals (10 here)
plt.xlabel(luminosity_col)  # xlabel sets the x-axis caption
plt.ylabel("frequency")  # ylabel sets the y-axis caption
plt.title(luminosity_col)  # title names the figure
plt.show()
# Stacked histogram of Minimum_of_Luminosity, one layer per fault class.
# The last class is spelled 'Pastry' to match the value stored in df['class'];
# the previous spelling 'Parstry' matched no rows, so that layer was always
# empty and the legend entry was misspelled.
fault_labels = ['Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults', 'Pastry']

# Select the rows of each class; df1..df7 follow the order of fault_labels.
df1, df2, df3, df4, df5, df6, df7 = (df[df['class'] == label] for label in fault_labels)

# stacked=True piles the per-class histograms on top of each other; the data
# must be passed as a list with one sequence per class.
plt.hist(
    [subset["Minimum_of_Luminosity"] for subset in (df1, df2, df3, df4, df5, df6, df7)],
    label=fault_labels,
    stacked=True,
)
plt.xlabel('Minimum_of_Luminosity')
plt.legend()  # show the legend built from `label`
plt.show()
# Bar chart of the number of rows in each fault class.
colors = ['lightsteelblue', 'cornflowerblue', 'royalblue', 'midnightblue', 'navy', 'darkblue', 'mediumblue']
count = df["class"].value_counts(sort=False)
# Bar heights, ordered like `choices`.
y = [count[fault] for fault in choices]
# Use `choices` for the x labels as well. The original passed
# df['class'].unique(), whose order is the order of first appearance in the
# data, so labels and heights lined up only if the file happened to be sorted
# in the same order as `choices`.
plt.bar(choices, y, color=colors, width=0.5)
plt.xlabel('class')
plt.ylabel('amount')
plt.title('Class data amount')
plt.show()
# Same bar chart as before, plus a horizontal reference line.
# x uses `choices` because `y` is ordered by `choices`; df['class'].unique()
# follows the order of appearance in the data and could mislabel the bars.
plt.bar(choices, y, color=colors, width=0.5)
plt.axhline(y=200, c="r", ls="--", lw=2)  # dashed red line at y=200 used as the reference level
plt.show()
# Scatter plot of minimum vs. maximum luminosity for the Z_Scratch rows only.
z_scratch_rows = df[df["class"] == "Z_Scratch"]
plt.scatter(
    x='Minimum_of_Luminosity',
    y="Maximum_of_Luminosity",
    data=z_scratch_rows,  # the x/y strings are looked up as columns of this frame
    alpha=0.2,  # semi-transparent markers so overlapping points stay visible
)
plt.xlabel('Minimum_of_Luminosity')
plt.ylabel('Maximum_of_Luminosity')
plt.show()
# Overlay X_Maximum vs. Y_Maximum scatter plots for three fault classes.
df1 = df[df['class'] == 'Pastry']
df2 = df[df['class'] == 'Z_Scratch']
df3 = df[df['class'] == 'Stains']
for subset, fault in ((df1, "Pastry"), (df2, "Z_Scratch"), (df3, "Stains")):
    # label tags each point set so that legend() can name it
    plt.scatter('X_Maximum', "Y_Maximum", data=subset, alpha=0.2, label=fault)
plt.legend()  # render the tagged point sets as a legend
plt.xlabel('X_Maximum')
plt.ylabel('Y_Maximum')
plt.show()
# Box plot of Minimum_of_Luminosity for three selected fault classes.
box_classes = ["Pastry", "Z_Scratch", "Stains"]
luminosity_by_class = [df[df["class"] == name].Minimum_of_Luminosity for name in box_classes]
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# `labels` is kept here so the call keeps working on older versions.
plt.boxplot(luminosity_by_class, labels=box_classes)
plt.ylabel('Minimum_of_Luminosity')
plt.xlabel('class')
plt.title('Box plot of Minimum_of_Luminosity')
plt.show()
# Line chart comparing two monthly price series on shared axes.
month = list(range(1, 13))
stock_tsmcc = [255, 246, 247.5, 227, 224, 216.5, 246, 256, 262.5, 234, 225.5, 225.5]
stock_foxconnn = [92.2, 88.1, 88.5, 82.9, 85.7, 83.2, 83.8, 80.5, 79.2, 78.8, 71.9, 70.8]
# Each entry: (prices, marker/line format, colour, legend label).
# 's-' draws square markers joined by a solid line, 'o-' uses circle markers.
for prices, fmt, colour, name in (
    (stock_tsmcc, 's-', 'r', "TSMC"),
    (stock_foxconnn, 'o-', 'g', "FOXCONN"),
):
    plt.plot(month, prices, fmt, color=colour, label=name)
plt.title("TSMC_FOXCONN")
plt.xlabel("month")
plt.ylabel("price")
plt.legend()
plt.show()
import seaborn as sns
# Histogram of Minimum_of_Luminosity with a KDE curve overlaid.
# sns.distplot is deprecated in seaborn; sns.histplot is its replacement.
# stat="density" keeps the normalised y-axis that distplot used when kde=True.
sns.histplot(df["Minimum_of_Luminosity"], kde=True, bins=25, stat="density")
# Bar plot of the number of rows in each fault class, drawn with seaborn.
count = df["class"].value_counts(sort=False)
# Bar heights, ordered like `choices`.
y = [count[fault] for fault in choices]
# x uses `choices` so every label is paired with its own count; the original
# df["class"].unique() follows the order of appearance in the data, which is
# not guaranteed to match the order of `y`.
sns.barplot(x=choices, y=y)
# Count of rows per class (seaborn does the counting itself).
sns.countplot(data=df, x='class')
# One point per row, spread horizontally by jitter to reduce overlap.
sns.stripplot(data=df, x='class', y='Y_Minimum', jitter=1)
# One point per row, arranged so that the points do not overlap.
sns.swarmplot(data=df, x='class', y='Y_Minimum')
# Joint distribution of two columns; kind may be scatter, reg, resid, kde or hex.
sns.jointplot(data=df, x='X_Maximum', y='Y_Minimum', kind='reg')
# Distribution of Minimum_of_Luminosity per class, as box plots.
sns.boxplot(data=df, x='class', y='Minimum_of_Luminosity')
# Same comparison as violin plots.
sns.violinplot(data=df, x='class', y='Minimum_of_Luminosity')
# Pairwise relationship grid over the DataFrame's columns.
sns.pairplot(data=df)